{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "8456b5b6-185b-440b-ab98-1822aac2fe4f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "with open('documents-with-ids.json', 'rt') as f_in:\n",
    "    documents = json.load(f_in)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "620ed5a1-cc06-40a3-8627-891aed525cba",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from elasticsearch import Elasticsearch\n",
    "\n",
    "es_client = Elasticsearch('http://localhost:9200') \n",
    "\n",
    "index_settings = {\n",
    "    \"settings\": {\n",
    "        \"number_of_shards\": 1,\n",
    "        \"number_of_replicas\": 0\n",
    "    },\n",
    "    \"mappings\": {\n",
    "        \"properties\": {\n",
    "            \"text\": {\"type\": \"text\"},\n",
    "            \"section\": {\"type\": \"text\"},\n",
    "            \"question\": {\"type\": \"text\"},\n",
    "            \"course\": {\"type\": \"keyword\"},\n",
    "            \"id\": {\"type\": \"keyword\"},\n",
    "        }\n",
    "    }\n",
    "}\n",
    "\n",
    "index_name = \"course-questions\"\n",
    "\n",
    "es_client.indices.delete(index=index_name, ignore_unavailable=True)\n",
    "es_client.indices.create(index=index_name, body=index_settings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "58a9daf0-9b68-48fb-b992-59782b625e33",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b7bffc1ed5fd4de2a26975f894976043",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/948 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from tqdm.auto import tqdm\n",
    "\n",
    "for doc in tqdm(documents):\n",
    "    es_client.index(index=index_name, document=doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "08c8586c-2976-4746-9b2d-0442c360211d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def elastic_search(query, course):\n",
    "    search_query = {\n",
    "        \"size\": 5,\n",
    "        \"query\": {\n",
    "            \"bool\": {\n",
    "                \"must\": {\n",
    "                    \"multi_match\": {\n",
    "                        \"query\": query,\n",
    "                        \"fields\": [\"question^3\", \"text\", \"section\"],\n",
    "                        \"type\": \"best_fields\"\n",
    "                    }\n",
    "                },\n",
    "                \"filter\": {\n",
    "                    \"term\": {\n",
    "                        \"course\": course\n",
    "                    }\n",
    "                }\n",
    "            }\n",
    "        }\n",
    "    }\n",
    "\n",
    "    response = es_client.search(index=index_name, body=search_query)\n",
    "    \n",
    "    result_docs = []\n",
    "    \n",
    "    for hit in response['hits']['hits']:\n",
    "        result_docs.append(hit['_source'])\n",
    "    \n",
    "    return result_docs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d7ec3171-f2cc-4be5-8de5-4e8131cdeccb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'text': \"Yes, even if you don't register, you're still eligible to submit the homeworks.\\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.\",\n",
       "  'section': 'General course-related questions',\n",
       "  'question': 'Course - Can I still join the course after the start date?',\n",
       "  'course': 'data-engineering-zoomcamp',\n",
       "  'id': '7842b56a'},\n",
       " {'text': 'You can start by installing and setting up all the dependencies and requirements:\\nGoogle cloud account\\nGoogle Cloud SDK\\nPython 3 (installed with Anaconda)\\nTerraform\\nGit\\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',\n",
       "  'section': 'General course-related questions',\n",
       "  'question': 'Course - What can I do before the course starts?',\n",
       "  'course': 'data-engineering-zoomcamp',\n",
       "  'id': '63394d91'},\n",
       " {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',\n",
       "  'section': 'General course-related questions',\n",
       "  'question': 'Course - Can I follow the course after it finishes?',\n",
       "  'course': 'data-engineering-zoomcamp',\n",
       "  'id': 'a482086d'},\n",
       " {'text': 'Yes, the slack channel remains open and you can ask questions there. But always sDocker containers exit code w search the channel first and second, check the FAQ (this document), most likely all your questions are already answered here.\\nYou can also tag the bot @ZoomcampQABot to help you conduct the search, but don’t rely on its answers 100%, it is pretty good though.',\n",
       "  'section': 'General course-related questions',\n",
       "  'question': 'Course - Can I get support if I take the course in the self-paced mode?',\n",
       "  'course': 'data-engineering-zoomcamp',\n",
       "  'id': 'eb56ae98'},\n",
       " {'text': \"You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.\",\n",
       "  'section': 'General course-related questions',\n",
       "  'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?',\n",
       "  'course': 'data-engineering-zoomcamp',\n",
       "  'id': '0bbf41ec'}]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "elastic_search(\n",
    "    query=\"I just discovered the course. Can I still join?\",\n",
    "    course=\"data-engineering-zoomcamp\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "163d5e97-e1b7-45e1-ba1e-61fca4fc37a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "e3da963b-cd17-471b-9cff-7f8c4276b915",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_ground_truth = pd.read_csv('ground-truth-data.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "d53d3beb-d699-46c8-bc85-953afc4ef48d",
   "metadata": {},
   "outputs": [],
   "source": [
    "ground_truth = df_ground_truth.to_dict(orient='records')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "1e08bcee-4c70-44b8-9525-ff89d9b2320b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "df5f0694de064d4fb7f220c79f6998c8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/4627 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "relevance_total = []\n",
    "\n",
    "for q in tqdm(ground_truth):\n",
    "    doc_id = q['document']\n",
    "    results = elastic_search(query=q['question'], course=q['course'])\n",
    "    relevance = [d['id'] == doc_id for d in results]\n",
    "    relevance_total.append(relevance)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "83e95114-afdb-40de-b044-0d83d07722c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "example = [\n",
    "    [True, False, False, False, False], # 1, \n",
    "    [False, False, False, False, False], # 0\n",
    "    [False, False, False, False, False], # 0 \n",
    "    [False, False, False, False, False], # 0\n",
    "    [False, False, False, False, False], # 0 \n",
    "    [True, False, False, False, False], # 1\n",
    "    [True, False, False, False, False], # 1\n",
    "    [True, False, False, False, False], # 1\n",
    "    [True, False, False, False, False], # 1\n",
    "    [True, False, False, False, False], # 1 \n",
    "    [False, False, True, False, False],  # 1/3\n",
    "    [False, False, False, False, False], # 0\n",
    "]\n",
    "\n",
    "# 1 => 1\n",
    "# 2 => 1 / 2 = 0.5\n",
    "# 3 => 1 / 3 = 0.3333\n",
    "# 4 => 0.25\n",
    "# 5 => 0.2\n",
    "# rank => 1 / rank\n",
    "# none => 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "8959c9ff-5bbe-4729-8fa3-cdc51ed10f5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "def hit_rate(relevance_total):\n",
    "    cnt = 0\n",
    "\n",
    "    for line in relevance_total:\n",
    "        if True in line:\n",
    "            cnt = cnt + 1\n",
    "\n",
    "    return cnt / len(relevance_total)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "842255b5-18f2-4102-9689-a5835e0a621c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def mrr(relevance_total):\n",
    "    total_score = 0.0\n",
    "\n",
    "    for line in relevance_total:\n",
    "        for rank in range(len(line)):\n",
    "            if line[rank] == True:\n",
    "                total_score = total_score + 1 / (rank + 1)\n",
    "\n",
    "    return total_score / len(relevance_total)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "d56661b2-d5ee-4683-9514-94b825da7ae3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.5833333333333334"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hit_rate(example)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "dc1348d1-6dfa-4f80-a532-0dd423b5333d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.5277777777777778"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mrr(example)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "46538d7c-4e5c-40c4-be02-836880ff7ca3",
   "metadata": {},
   "source": [
    "- hit-rate (recall)\n",
    "- Mean Reciprocal Rank (mrr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "e12883f1-13c1-4fe4-83ac-a4833d1cc97b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.7395720769397017, 0.6032418413658963)"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hit_rate(relevance_total), mrr(relevance_total)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "d10bb1fd-6139-4450-864e-23f7b8531dee",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<minsearch.Index at 0x29109dae150>"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import minsearch\n",
    "\n",
    "index = minsearch.Index(\n",
    "    text_fields=[\"question\", \"text\", \"section\"],\n",
    "    keyword_fields=[\"course\", \"id\"]\n",
    ")\n",
    "\n",
    "index.fit(documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "e3d12e7b-eb97-4d47-b77f-c8cbb0d0ed73",
   "metadata": {},
   "outputs": [],
   "source": [
    "def minsearch_search(query, course):\n",
    "    boost = {'question': 3.0, 'section': 0.5}\n",
    "\n",
    "    results = index.search(\n",
    "        query=query,\n",
    "        filter_dict={'course': course},\n",
    "        boost_dict=boost,\n",
    "        num_results=5\n",
    "    )\n",
    "\n",
    "    return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "351255d3-2fa5-400e-954a-72c115e94637",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3fdae1fbca2b4cd6b887b751364aec06",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/4627 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "relevance_total = []\n",
    "\n",
    "for q in tqdm(ground_truth):\n",
    "    doc_id = q['document']\n",
    "    results = minsearch_search(query=q['question'], course=q['course'])\n",
    "    relevance = [d['id'] == doc_id for d in results]\n",
    "    relevance_total.append(relevance)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "b9c2a382-edce-4e6a-8d2e-caabf43f4483",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.7722066133563864, 0.661454506159499)"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hit_rate(relevance_total), mrr(relevance_total)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dbf29cd6-705b-4076-9aef-098dcfaceaf1",
   "metadata": {},
   "source": [
    "Compare with ES results:\n",
    "```\n",
    "(0.7395720769397017, 0.6032418413658963)\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "f11baaff-43d9-4b8c-a896-561b86e85743",
   "metadata": {},
   "outputs": [],
   "source": [
    "def evaluate(ground_truth, search_function):\n",
    "    relevance_total = []\n",
    "\n",
    "    for q in tqdm(ground_truth):\n",
    "        doc_id = q['document']\n",
    "        results = search_function(q)\n",
    "        relevance = [d['id'] == doc_id for d in results]\n",
    "        relevance_total.append(relevance)\n",
    "\n",
    "    return {\n",
    "        'hit_rate': hit_rate(relevance_total),\n",
    "        'mrr': mrr(relevance_total),\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "611846a8-a977-4483-bd01-61575821c18c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "21adf8ef14ad490992a05478d63715f1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/4627 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "{'hit_rate': 0.7395720769397017, 'mrr': 0.6032418413658963}"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "evaluate(ground_truth, lambda q: elastic_search(q['question'], q['course']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "14cab5ff-f42e-4030-b7ba-6edf47d73d21",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "40cdeba982f74f998c54fa301d4898f0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/4627 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "{'hit_rate': 0.7722066133563864, 'mrr': 0.661454506159499}"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1d3530e-1406-49dd-bba9-914f6a39d7f2",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
